Importing Libraries¶
#Importing Libraries for ML
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import ttest_ind
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings("ignore")
Importing Cleaned Data¶
# Load the pre-cleaned dataset produced by the earlier cleaning step and preview it.
df = pd.read_csv('cleaned_data.csv')
df.head(3)
| name | MDVP:Fo(Hz) | MDVP:Fhi(Hz) | MDVP:Flo(Hz) | MDVP:Jitter(%) | MDVP:Jitter(Abs) | MDVP:RAP | MDVP:PPQ | Jitter:DDP | MDVP:Shimmer | ... | Shimmer:DDA | NHR | HNR | status | RPDE | DFA | spread1 | spread2 | D2 | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 119.992 | 157.302 | 74.997 | 0.00784 | 0.00007 | 0.00370 | 0.00554 | 0.01109 | 0.04374 | ... | 0.06545 | 0.02211 | 21.033 | 1.0 | 0.414783 | 0.815285 | -4.813031 | 0.266482 | 2.301442 | 0.284654 |
| 1 | 1 | 122.400 | 148.650 | 113.819 | 0.00968 | 0.00008 | 0.00465 | 0.00696 | 0.01394 | 0.06134 | ... | 0.09403 | 0.01929 | 19.085 | 1.0 | 0.458359 | 0.819521 | -4.075192 | 0.335590 | 2.486855 | 0.368674 |
| 2 | 2 | 116.682 | 131.111 | 111.555 | 0.01050 | 0.00009 | 0.00544 | 0.00781 | 0.01633 | 0.05233 | ... | 0.08270 | 0.01309 | 20.651 | 1.0 | 0.429895 | 0.825288 | -4.443179 | 0.311173 | 2.342259 | 0.332634 |
3 rows × 24 columns
Data Exploration (EDA)¶
- Univariate analysis on numerical data
- Bivariate analysis on numerical data vs numerical data
- Multivariate analysis on numerical data
Univariate analysis on numerical data¶
Descriptive Statistics¶
# Function to create dataframe for columns and its unique count of values, datatypes and type of columns
def find_categorical_continous_variables(data, threashold):
    """Classify every column of *data* as categorical or continuous.

    A column counts as categorical when its unique-value count is at most
    ``threashold`` or its dtype is object ('O'); otherwise it is continuous.

    Parameters
    ----------
    data : pd.DataFrame
        Data whose columns should be classified.
    threashold : int
        Maximum number of unique values for a column to be categorical.

    Returns
    -------
    tuple
        (summary frame indexed by column name with unique count, dtype and
        classification; list of categorical column names; list of
        continuous column names)
    """
    dic = {}
    continus_cols = []
    categorical_cols = []
    for column in data.columns:
        # Compute the unique count once per column instead of calling
        # unique() twice (dropna=False matches len(Series.unique()),
        # which includes NaN).
        n_unique = data[column].nunique(dropna=False)
        dtype = data[column].dtype
        # Check if the column is categorical or continuous
        if n_unique <= threashold or dtype == 'O':
            kind = "Categorical column"
            categorical_cols.append(column)
        else:
            kind = "Continous Column"
            continus_cols.append(column)
        dic[column] = [n_unique, dtype, kind]
    # create dataframe for columns and its details
    dic_df = pd.DataFrame(dic, index=["Unique values", "Data Type", "Categorical/Continous"])
    # return the dataframe, and lists for continous and categorical columns
    return dic_df.T, categorical_cols, continus_cols
# Function to find descriptive statistics by providing it the contious cols and dataframe
def descriptive_statistics_continous(df, continus_cols):
    """Build a table of descriptive statistics for the given continuous columns.

    For each column: mean, median, mode (first modal value), standard
    deviation, variance, range (max - min), IQR (Q3 - Q1), skew, kurtosis.
    Returns a DataFrame with one row per column and one column per statistic.
    """
    stat_names = ["Mean", "Median", "Mode", "Std Deviation", "Variance",
                  "Range", "IQR", "Skew", "Kurtosis"]
    stats = {}
    for col in continus_cols:
        series = df[col]
        stats[col] = [
            series.mean(),
            series.median(),
            series.mode()[0],
            series.std(),
            series.var(),
            series.max() - series.min(),
            series.quantile(0.75) - series.quantile(0.25),
            series.skew(),
            series.kurtosis(),
        ]
    return pd.DataFrame(stats, index=stat_names).T
# lets find out the unique counts, datatypes, variable type like continous/categorical and lists for columns names having continous/categorical columns
# Columns with <= 10 unique values (or object dtype) are treated as categorical.
col_type_df, categorical_cols, continus_cols = find_categorical_continous_variables(df, 10)
# display the data
col_type_df
| Unique values | Data Type | Categorical/Continous | |
|---|---|---|---|
| name | 189 | int64 | Continous Column |
| MDVP:Fo(Hz) | 189 | float64 | Continous Column |
| MDVP:Fhi(Hz) | 189 | float64 | Continous Column |
| MDVP:Flo(Hz) | 189 | float64 | Continous Column |
| MDVP:Jitter(%) | 168 | float64 | Continous Column |
| MDVP:Jitter(Abs) | 18 | float64 | Continous Column |
| MDVP:RAP | 149 | float64 | Continous Column |
| MDVP:PPQ | 162 | float64 | Continous Column |
| Jitter:DDP | 174 | float64 | Continous Column |
| MDVP:Shimmer | 182 | float64 | Continous Column |
| MDVP:Shimmer(dB) | 145 | float64 | Continous Column |
| Shimmer:APQ3 | 180 | float64 | Continous Column |
| Shimmer:APQ5 | 183 | float64 | Continous Column |
| MDVP:APQ | 183 | float64 | Continous Column |
| Shimmer:DDA | 184 | float64 | Continous Column |
| NHR | 181 | float64 | Continous Column |
| HNR | 189 | float64 | Continous Column |
| status | 2 | float64 | Categorical column |
| RPDE | 189 | float64 | Continous Column |
| DFA | 189 | float64 | Continous Column |
| spread1 | 189 | float64 | Continous Column |
| spread2 | 188 | float64 | Continous Column |
| D2 | 189 | float64 | Continous Column |
| PPE | 189 | float64 | Continous Column |
# print columns
# Report which columns were classified as categorical vs continuous above.
print("Categorical Columns -\n",categorical_cols, "\n")
print("Continous Columns -\n",continus_cols)
Categorical Columns - ['status'] Continous Columns - ['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']
Conclusion -¶
We have one categorical variable i.e. ['status']
We have 23 Continous variables i.e. ['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']
# Get the descriptive statistics
# Central tendency (mean/median/mode), spread (std, var, range, IQR) and shape (skew, kurtosis).
descriptive_statistics_continous(df, continus_cols)
| Mean | Median | Mode | Std Deviation | Variance | Range | IQR | Skew | Kurtosis | |
|---|---|---|---|---|---|---|---|---|---|
| name | 94.000000 | 94.000000 | 0.000000 | 54.703748 | 2.992500e+03 | 188.000000 | 94.000000 | 0.000000 | -1.200000 |
| MDVP:Fo(Hz) | 154.605011 | 148.790000 | 88.333000 | 41.525291 | 1.724350e+03 | 171.772000 | 65.557000 | 0.592528 | -0.633226 |
| MDVP:Fhi(Hz) | 198.026812 | 176.595000 | 102.145000 | 92.490876 | 8.554562e+03 | 489.885000 | 88.913000 | 2.515378 | 7.401087 |
| MDVP:Flo(Hz) | 115.305561 | 102.874000 | 1.000000 | 44.653702 | 1.993953e+03 | 238.170000 | 51.701000 | 1.130111 | 0.734658 |
| MDVP:Jitter(%) | 0.006093 | 0.004940 | 0.003690 | 0.004558 | 2.077129e-05 | 0.031480 | 0.003810 | 3.050555 | 12.379222 |
| MDVP:Jitter(Abs) | 0.000043 | 0.000030 | 0.000030 | 0.000034 | 1.157854e-09 | 0.000253 | 0.000030 | 2.775989 | 12.202619 |
| MDVP:RAP | 0.003236 | 0.002470 | 0.001690 | 0.002807 | 7.878800e-06 | 0.020760 | 0.002070 | 3.408734 | 15.450912 |
| MDVP:PPQ | 0.066817 | 0.002630 | 0.003320 | 0.872632 | 7.614869e-01 | 12.000000 | 0.002040 | 13.747551 | 188.996748 |
| Jitter:DDP | 0.009688 | 0.007420 | 0.005070 | 0.008418 | 7.085477e-05 | 0.062290 | 0.006180 | 3.420839 | 15.523735 |
| MDVP:Shimmer | 0.029264 | 0.022960 | 0.014500 | 0.017908 | 3.207078e-04 | 0.084650 | 0.021150 | 1.420081 | 1.657132 |
| MDVP:Shimmer(dB) | 0.282175 | 0.221000 | 0.154000 | 0.189523 | 3.591883e-02 | 0.933000 | 0.201000 | 1.704636 | 2.910687 |
| Shimmer:APQ3 | 0.015492 | 0.012770 | 0.004690 | 0.009823 | 9.649054e-05 | 0.050830 | 0.012010 | 1.475233 | 2.208598 |
| Shimmer:APQ5 | 0.017617 | 0.013430 | 0.007470 | 0.011290 | 1.274623e-04 | 0.049860 | 0.012740 | 1.446952 | 1.428675 |
| MDVP:APQ | 0.023583 | 0.017990 | 0.009030 | 0.015002 | 2.250460e-04 | 0.080890 | 0.016220 | 1.660942 | 2.946433 |
| Shimmer:DDA | 0.046476 | 0.038310 | 0.015670 | 0.029468 | 8.683737e-04 | 0.152510 | 0.036020 | 1.475234 | 2.208833 |
| NHR | 0.023823 | 0.011660 | 0.002310 | 0.038322 | 1.468603e-03 | 0.314170 | 0.018900 | 4.505491 | 25.951523 |
| HNR | 21.961164 | 22.085000 | 8.867000 | 4.268406 | 1.821929e+01 | 24.180000 | 5.763000 | -0.445051 | 0.571981 |
| RPDE | 0.497594 | 0.491345 | 0.256570 | 0.103133 | 1.063635e-02 | 0.428581 | 0.163781 | -0.129763 | -0.909410 |
| DFA | 0.717117 | 0.722085 | 0.574282 | 0.054975 | 3.022233e-03 | 0.251006 | 0.086725 | -0.039189 | -0.663940 |
| spread1 | -5.725437 | -5.736781 | -7.964984 | 1.051205 | 1.105032e+00 | 5.125228 | 1.320026 | 0.347205 | -0.210744 |
| spread2 | 0.223666 | 0.217013 | 0.210279 | 0.081229 | 6.598211e-03 | 0.428052 | 0.099760 | 0.074474 | -0.130574 |
| D2 | 2.376950 | 2.361532 | 1.423287 | 0.368745 | 1.359730e-01 | 1.990362 | 0.531619 | 0.312325 | -0.072176 |
| PPE | 0.203052 | 0.192730 | 0.044539 | 0.086461 | 7.475510e-03 | 0.412994 | 0.113350 | 0.721347 | 0.261612 |
Conclusion -¶
We can see the descriptive statistical inferences like above
def plot_distribution_numerical(df):
    """Plot a histogram, boxplot and violinplot for every numeric column of *df*.

    One row of three subplots per numeric feature. The grid is sized from the
    actual number of numeric columns: the previous hard-coded 23-row grid
    raised ``IndexError`` when the frame had more numeric columns than rows.
    """
    # Lets plot the histogram for each variable
    features = df.select_dtypes(include='number').columns
    # Size the subplot grid from the data; squeeze=False keeps axs 2-D even
    # when there is a single numeric column.
    fig, axs = plt.subplots(len(features), 3, figsize=(20, 60), squeeze=False)
    for row, feature in enumerate(features):
        axs[row, 0].set_title("Histogram for {}".format(feature))
        sns.histplot(data=df, x=feature, kde=True, color="red", ax=axs[row, 0])
        axs[row, 1].set_title("Boxplot for {}".format(feature))
        sns.boxplot(data=df, x=feature, color="skyblue", ax=axs[row, 1])
        axs[row, 2].set_title("Violinplot for {}".format(feature))
        sns.violinplot(data=df, x=feature, color="lightgreen", ax=axs[row, 2])
    # suptitle titles the whole figure; plt.title would only label the last axes.
    fig.suptitle("Histogram, Boxplot and Violinplots for all variables")
    plt.tight_layout()
    plt.show()
plot_distribution_numerical(df)
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[8], line 24 21 plt.tight_layout() 22 plt.show() ---> 24 plot_distribution_numerical(df) Cell In[8], line 10, in plot_distribution_numerical(df) 7 row = 0 8 for feature in features: ---> 10 axs[row, 0].set_title("Histogram for {}".format(feature)) 11 sns.histplot(data=df, x=feature, kde=True, color="red", ax=axs[row, 0]) 13 axs[row, 1].set_title("Boxplot for {}".format(feature)) IndexError: index 23 is out of bounds for axis 0 with size 23
Conclusion -¶
- As per the skewness, kurtosis and above histogram, boxplots and violinplots we can see that below columns are normally distributed -
DFA, spread1, spread2, D2 and PPE
- Below columns are skewed to the right
'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA'
Bivariate analysis on numerical data¶
# Covariance for pair of variables
def covariance_of_pair_variables(df):
    """Compute the covariance of every unordered pair of numeric columns.

    Returns a one-column DataFrame ("Covariance") indexed by
    "<col1> - <col2>" and sorted ascending by covariance.
    """
    features = df.select_dtypes(include='number').columns
    pair_cov = {}
    for i, first in enumerate(features):
        for second in features[i + 1:]:
            pair_cov[first + " - " + second] = df[first].cov(df[second])
    covariance_df = pd.DataFrame(pair_cov, index=["Covariance"]).T
    return covariance_df.sort_values(by='Covariance')
covariance_of_pair_variables(df)
| Covariance | |
|---|---|
| name - MDVP:Flo(Hz) | -143.088761 |
| name - HNR | -48.473436 |
| MDVP:Fo(Hz) - spread1 | -19.036930 |
| MDVP:Flo(Hz) - spread1 | -18.886935 |
| MDVP:Flo(Hz) - status | -7.539918 |
| ... | ... |
| name - MDVP:Fo(Hz) | 81.688888 |
| MDVP:Fhi(Hz) - MDVP:Flo(Hz) | 351.507050 |
| name - MDVP:Fhi(Hz) | 706.529244 |
| MDVP:Fo(Hz) - MDVP:Flo(Hz) | 1081.942775 |
| MDVP:Fo(Hz) - MDVP:Fhi(Hz) | 1511.889706 |
276 rows × 1 columns
Conclusion -¶
Below pairs of variables have the lowest (most negative) covariance -
- name - MDVP:Flo(Hz)
- name - HNR
- MDVP:Fo(Hz) - spread1
- MDVP:Flo(Hz) - spread1
- MDVP:Flo(Hz) - status
Below pairs of variables have the highest (most positive) covariance -
- name - MDVP:Fo(Hz)
- MDVP:Fhi(Hz) - MDVP:Flo(Hz)
- name - MDVP:Fhi(Hz)
- MDVP:Fo(Hz) - MDVP:Flo(Hz)
- MDVP:Fo(Hz) - MDVP:Fhi(Hz)
# Correlations matrix
# Annotated heatmap of pairwise Pearson correlations for all numeric columns.
plt.figure(figsize=(15, 15))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
<Axes: >
# plotting the scatterplots for all variables combinations
# One grid cell per pair of columns; used alongside the heatmap above.
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1bb42c42dd0>
Conclusions -¶
As per the scatterplot and heatmap presented we can see that below variables are highly correlated to each other
- 'MDVP:Fo(Hz)' => 'MDVP:Fhi(Hz)'
- 'MDVP:Fo(Hz)' => 'MDVP:Flo(Hz)'
- 'MDVP:Flo(Hz)' => 'MDVP:Fhi(Hz)'
- 'MDVP:RAP' => 'MDVP:Jitter(%)'
- 'MDVP:Jitter(%)'=> 'Jitter:DDP'
- 'MDVP:Jitter(%)'=> 'MDVP:PPQ'
- 'MDVP:Jitter(%)'=> 'MDVP:RAP'
- 'MDVP:Jitter(%)'=> 'NHR'
- 'MDVP:Shimmer' => 'MDVP:Shimmer(dB)',
- 'MDVP:Shimmer' => 'Shimmer:DDA',
- 'MDVP:Shimmer' => 'Shimmer:APQ5',
- 'MDVP:Shimmer' => 'NHR',
- 'MDVP:Shimmer' => 'Shimmer:APQ3',
- 'MDVP:Shimmer' => 'HNR'
- 'MDVP:Shimmer' => 'MDVP:APQ',
def t_test_for_two_groups(df, target_var, first_target, second_target):
    """Run an independent two-sample t-test on every numeric feature.

    For each numeric column, compares the rows where ``target_var`` equals
    ``first_target`` against the rows where it equals ``second_target`` and
    prints the t-statistic, p-value and an interpretation at alpha = 0.05.

    The grouping column itself is skipped: testing it against its own
    groups is degenerate and produced T-statistic inf / P-value 0.0.
    """
    # Significance level, hoisted out of the loop (it never changes).
    alpha = 0.05
    for feature in df.select_dtypes(include='number').columns:
        # Skip the grouping column — comparing it with itself is meaningless.
        if feature == target_var:
            continue
        # Lets have two groups one for datapoints when "status is 0" and another when "status is 1"
        group_1 = df[df[target_var]==first_target][feature]
        group_0 = df[df[target_var]==second_target][feature]
        # Perform independent samples t-test
        t_statistic, p_value = ttest_ind(group_1, group_0)
        # Print the results
        print("\n---------{} for two outcomes of {}-------------\n".format(feature, target_var))
        print("T-statistic:", t_statistic)
        print("P-value:", p_value)
        # Interpret the results
        if p_value < alpha:
            print("Reject the null hypothesis: There is a significant difference in means of the two data groups of {} when {} is {} and {}".format(feature, target_var, first_target, second_target))
        else:
            print("Fail to reject the null hypothesis: There is NO significant difference in means of the two data groups of {} when {} is {} and {}".format(feature, target_var, first_target, second_target))
t_test_for_two_groups(df, 'status', 1, 0)
---------name for two outcomes of status------------- T-statistic: -2.618996447591034 P-value: 0.009542284438269866 Reject the null hypothesis: There is a significant difference in means of the two data groups of name when status is 1 and 0 ---------MDVP:Fo(Hz) for two outcomes of status------------- T-statistic: -5.447839322812917 P-value: 1.592227189146496e-07 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Fo(Hz) when status is 1 and 0 ---------MDVP:Fhi(Hz) for two outcomes of status------------- T-statistic: -2.105966006139477 P-value: 0.03654192080671688 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Fhi(Hz) when status is 1 and 0 ---------MDVP:Flo(Hz) for two outcomes of status------------- T-statistic: -5.692141471654098 P-value: 4.784923852241762e-08 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Flo(Hz) when status is 1 and 0 ---------MDVP:Jitter(%) for two outcomes of status------------- T-statistic: 4.0859747918809335 P-value: 6.506825054289377e-05 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Jitter(%) when status is 1 and 0 ---------MDVP:Jitter(Abs) for two outcomes of status------------- T-statistic: 4.8761944285090975 P-value: 2.3037111641167024e-06 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Jitter(Abs) when status is 1 and 0 ---------MDVP:RAP for two outcomes of status------------- T-statistic: 3.8768349749731166 P-value: 0.0001463942426461765 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:RAP when status is 1 and 0 ---------MDVP:PPQ for two outcomes of status------------- T-statistic: 0.6027388372559829 P-value: 0.5474126882565669 Fail to reject the null hypothesis: There is NO significant difference in means of the two 
data groups of MDVP:PPQ when status is 1 and 0 ---------Jitter:DDP for two outcomes of status------------- T-statistic: 3.856292612294128 P-value: 0.0001582581388863351 Reject the null hypothesis: There is a significant difference in means of the two data groups of Jitter:DDP when status is 1 and 0 ---------MDVP:Shimmer for two outcomes of status------------- T-statistic: 5.7197069054907725 P-value: 4.168890849126019e-08 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Shimmer when status is 1 and 0 ---------MDVP:Shimmer(dB) for two outcomes of status------------- T-statistic: 5.5083786926335705 P-value: 1.1858552827842587e-07 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:Shimmer(dB) when status is 1 and 0 ---------Shimmer:APQ3 for two outcomes of status------------- T-statistic: 5.300447394450554 P-value: 3.2329358091596845e-07 Reject the null hypothesis: There is a significant difference in means of the two data groups of Shimmer:APQ3 when status is 1 and 0 ---------Shimmer:APQ5 for two outcomes of status------------- T-statistic: 5.5091649170951404 P-value: 1.1813090095950779e-07 Reject the null hypothesis: There is a significant difference in means of the two data groups of Shimmer:APQ5 when status is 1 and 0 ---------MDVP:APQ for two outcomes of status------------- T-statistic: 6.077737643304644 P-value: 6.6971916263062925e-09 Reject the null hypothesis: There is a significant difference in means of the two data groups of MDVP:APQ when status is 1 and 0 ---------Shimmer:DDA for two outcomes of status------------- T-statistic: 5.300281352950314 P-value: 3.2354923539354584e-07 Reject the null hypothesis: There is a significant difference in means of the two data groups of Shimmer:DDA when status is 1 and 0 ---------NHR for two outcomes of status------------- T-statistic: 2.647959426829298 P-value: 0.00878949536615006 Reject the null hypothesis: There is a 
significant difference in means of the two data groups of NHR when status is 1 and 0 ---------HNR for two outcomes of status------------- T-statistic: -5.409563773613764 P-value: 1.9161398150073093e-07 Reject the null hypothesis: There is a significant difference in means of the two data groups of HNR when status is 1 and 0 ---------status for two outcomes of status------------- T-statistic: inf P-value: 0.0 Reject the null hypothesis: There is a significant difference in means of the two data groups of status when status is 1 and 0 ---------RPDE for two outcomes of status------------- T-statistic: 4.350125625164272 P-value: 2.2342434057464583e-05 Reject the null hypothesis: There is a significant difference in means of the two data groups of RPDE when status is 1 and 0 ---------DFA for two outcomes of status------------- T-statistic: 3.319606465498123 P-value: 0.0010836705446395163 Reject the null hypothesis: There is a significant difference in means of the two data groups of DFA when status is 1 and 0 ---------spread1 for two outcomes of status------------- T-statistic: 9.492051691133758 P-value: 1.0853373799969717e-17 Reject the null hypothesis: There is a significant difference in means of the two data groups of spread1 when status is 1 and 0 ---------spread2 for two outcomes of status------------- T-statistic: 7.067310887483291 P-value: 3.0381818141581874e-11 Reject the null hypothesis: There is a significant difference in means of the two data groups of spread2 when status is 1 and 0 ---------D2 for two outcomes of status------------- T-statistic: 5.3974150453986125 P-value: 2.0317665984241093e-07 Reject the null hypothesis: There is a significant difference in means of the two data groups of D2 when status is 1 and 0 ---------PPE for two outcomes of status------------- T-statistic: 8.695379609608944 P-value: 1.7455826911011754e-15 Reject the null hypothesis: There is a significant difference in means of the two data groups of PPE when status is 1 and 0
Multivariate analysis on numerical data¶
from sklearn import datasets, decomposition
# Split the DataFrame into X and Y datasets
# Features (independent variables)
X = df.drop(columns=['status', 'name'])
# Target variable (dependent variable)
y = df['status']
# Standardize features so PCA is not dominated by large-scale columns.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
#Apply PCA
# Project the standardized features onto two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
#Combine PCA Components and Target Variable
# NOTE(review): this rebinds the module-level `df`, discarding the original
# dataframe — any later cell expecting the full feature set will break.
# Consider a distinct name such as `pca_df`.
df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
df['Target'] = y
#Apply Clustering Algorithm
# NOTE(review): the target label is passed to KMeans as a feature, which
# leaks the label into the unsupervised step — confirm this is intended.
kmeans = KMeans(n_clusters=2, random_state=42)
df['Cluster'] = kmeans.fit_predict(df[['PC1', 'PC2', 'Target']])
# Visualize clustering results
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
# Color by cluster assignment; marker style distinguishes the true target.
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', style='Target', palette='viridis')
plt.title('PCA Clustering based on Target Variable')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.show()